Tests de independencia
Cargar librerías
library(dplyr)
library(ggplot2)
Load the data
# Read the raw heart-disease table from the CSV file.
hd_data <- read.csv(file = "data/Cleveland_hd.csv")
Data processing
# Build the analysis table one derived column at a time:
#   sex         - factor: 0 becomes "mujer", any other value "hombre"
#   hd          - 1 when `class` is above 0, otherwise 0
#   hd_etiqueta - factor label derived from `hd`
# The two new columns are moved right after `sex`, then only the range
# age..hd_etiqueta plus three numeric measurements is kept.
# (assumes `age` sits before `sex` in the CSV column order - TODO confirm)
hd_pr <- hd_data %>%
  mutate(sex = as.factor(ifelse(sex == 0, "mujer", "hombre"))) %>%
  mutate(hd = ifelse(class > 0, 1, 0)) %>%
  mutate(
    hd_etiqueta = as.factor(ifelse(hd == 0, "No enfermedad", "Enfermedad"))
  ) %>%
  relocate(hd, hd_etiqueta, .after = sex) %>%
  select(age:hd_etiqueta, trestbps, chol, thalach)
Chi cuadrado
Frecuencias observadas frente a las frecuencias esperadas.
Caso de una variable. Ejemplo : moneda (legal) vs moneda trucada.
# Fair coin: draw n tosses with replacement; sample() uses equal
# probabilities because no `prob` is supplied.
n <- 100
moneda <- sample(c("cara", "cruz"), size = n, replace = TRUE)
# Observed count of each face.
table(moneda)
## moneda
## cara cruz
## 52 48
# Chi-squared test on the one-way table of counts (the printed header reads
# "Chi-squared test for given probabilities").
chisq.test(table(moneda))
##
## Chi-squared test for given probabilities
##
## data: table(moneda)
## X-squared = 0.16, df = 1, p-value = 0.6892
# Dataviz for fair coin: turn the tally of tosses into a data frame with
# columns `res` (face) and `freq` (observed count).
datos <- table(moneda) %>%
  as.data.frame() %>%
  rename(res = moneda, freq = Freq)
# Bars are the observed counts; the dashed red line sits at n / 2, the count
# each face would have under the null hypothesis shown in the subtitle.
ggplot(datos, aes(x = res, y = freq, fill = res)) +
  geom_col(width = 0.6, color = "black") +
  geom_hline(yintercept = n / 2, color = "red", linetype = "dashed", size = 1) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  labs(
    title = "Resultados de 100 lanzamientos",
    subtitle = expression("Hipótesis nula: " ~ H[0] * ": p = 0.5"),
    x = "",
    y = "Frecuencia observada"
  )
# Rigged coin
# Rigged coin simulation: draw n values from {0, 1} with replacement, where
# 0 has probability 0.6 and 1 has probability 0.4.
n <- 100000
t_moneda <- sample(c(0, 1), size = n, replace = TRUE, prob = c(0.6, 0.4))
# Observed count of each outcome.
table(t_moneda)
## t_moneda
## 0 1
## 59884 40116
# Chi-squared test on the one-way table of counts (the printed header reads
# "Chi-squared test for given probabilities").
chisq.test(table(t_moneda))
##
## Chi-squared test for given probabilities
##
## data: table(t_moneda)
## X-squared = 3907.7, df = 1, p-value < 2.2e-16
# Dataviz for rigged coin: tally the simulated tosses into a data frame with
# columns `res` and `freq`, then relabel outcome 0 as "cara" and the other
# outcome as "cruz".
datos <- table(t_moneda) %>%
  as.data.frame() %>%
  rename(res = t_moneda, freq = Freq) %>%
  mutate(res = ifelse(res == 0, "cara", "cruz"))
# Bars are the observed counts; the dashed red line sits at n / 2, the count
# each face would have under the null hypothesis shown in the subtitle.
datos %>%
  ggplot(aes(x = res, y = freq, fill = res)) +
  geom_col(width = 0.6, color = "black") +
  geom_hline(yintercept = n / 2, color = "red", linetype = "dashed", size = 1) +
  scale_fill_manual(values = c("skyblue", "salmon")) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  labs(
    title = "Resultados de 100000 lanzamientos",
    subtitle = expression("Hipótesis nula: " ~ H[0] * ": p = 0.5"),
    x = "",
    y = "Frecuencia observada"
  )
Caso de dos variables. Ejemplo : ¿ hay asociación entre el sexo (variable cualitativa) y la condición sano/enfermo (variable cualitativa) ?
Planteamiento de hipótesis :
H0: There is no association between sex and hd (= entre hd y sex).
H1: There is association between sex and hd (= entre hd y sex).
# Extract the disease label and the sex columns as plain vectors.
vals_hd <- hd_pr[["hd_etiqueta"]]
vals_sex <- hd_pr[["sex"]]
# Chi-squared test of independence between the two categorical vectors.
hd_by_sex <- chisq.test(x = vals_sex, y = vals_hd)
print(hd_by_sex)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: vals_sex and vals_hd
## X-squared = 22.043, df = 1, p-value = 2.667e-06
# Stacked bars of sex within each disease group. position = "fill" rescales
# every bar to a 0-1 proportion, so the tick labels are multiplied by 100 to
# agree with the percentage axis title.
hd_pr %>%
  ggplot(aes(x = hd_etiqueta, fill = sex)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = function(p) 100 * p) +
  labs(x = "", y = "porcentaje %")
t test
El t test se realiza cuando tenemos una variable dependiente y una variable independiente categórica con dos grupos.
Examina la variable ‘chol’ entre los grupos enfermedad/sanos.
# Grouping vector (disease label) and response vector (chol) for the t-test.
condition <- hd_pr[["hd_etiqueta"]]
colesterol <- hd_pr[["chol"]]
# Two-sample t-test of chol between the two disease groups; the printed
# header identifies it as a Welch test.
chol_by_condition <- t.test(colesterol ~ condition)
print(chol_by_condition)
##
## Welch Two Sample t-test
##
## data: colesterol by condition
## t = 1.4924, df = 298.64, p-value = 0.1366
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.815018 20.484170
## sample estimates:
## mean in group Enfermedad mean in group No enfermedad
## 251.4748 242.6402
# Boxplot of chol per disease group with every observation jittered on top.
# outlier.shape = NA suppresses the boxplot's own outlier markers, since the
# jitter layer already draws all points.
hd_pr %>%
  ggplot(aes(x = hd_etiqueta, y = chol, color = hd_etiqueta)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(width = 0.3, alpha = 0.4) +
  # Fixed: the axis was labelled "edad" although the plotted variable is chol.
  labs(x = "", y = "colesterol") +
  theme(legend.position = "none")
Ejercicios :
1. Examina la variable ‘age’ entre los grupos enfermedad/sanos
# Exercise 1: two-sample t-test of age between the disease groups.
# `condition` is the hd_etiqueta vector created for the chol test.
age <- hd_pr[["age"]]
age_by_condition <- t.test(age ~ condition)
age_by_condition
##
## Welch Two Sample t-test
##
## data: age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.067682 6.013385
## sample estimates:
## mean in group Enfermedad mean in group No enfermedad
## 56.62590 52.58537
# Boxplot of age per disease group with the raw observations jittered on top;
# boxplot outlier markers are hidden and the legend is dropped.
ggplot(hd_pr, aes(x = hd_etiqueta, y = age, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  theme(legend.position = "none") +
  labs(x = "", y = "edad")
2. Examina la variable ‘thalach’ entre los grupos enfermedad/sanos
# Exercise 2: two-sample t-test of thalach between the disease groups.
# `tension` holds the thalach column; `condition` is the hd_etiqueta vector
# created for the chol test.
tension <- hd_pr %>% pull(thalach)
# Fixed: the formula used `age`, so this exercise silently repeated the age
# test and never touched the thalach values pulled above.
thalach_by_condition <- t.test(tension ~ condition)
print(thalach_by_condition)
# NOTE(review): the printed output that follows was produced by the old
# `age ~ condition` call (its header reads "data: age by condition");
# re-knit the document to refresh it.
##
## Welch Two Sample t-test
##
## data: age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.067682 6.013385
## sample estimates:
## mean in group Enfermedad mean in group No enfermedad
## 56.62590 52.58537
# Boxplot of thalach per disease group with every observation jittered on top.
# outlier.shape = NA suppresses the boxplot's own outlier markers, since the
# jitter layer already draws all points.
hd_pr %>%
  # Fixed: y was mapped to `age` although this exercise and the axis label
  # ("maximum heart rate") are about thalach.
  ggplot(aes(x = hd_etiqueta, y = thalach, col = hd_etiqueta)) +
  geom_boxplot(show.legend = FALSE, outlier.shape = NA) +
  geom_jitter(width = 0.3, alpha = 0.4) +
  labs(x = "", y = "maximum heart rate") +
  theme(legend.position = "none")
3. Examina la variable ‘trestbps’ entre los grupos enfermedad/sanos
# Exercise 3: two-sample t-test of trestbps between the disease groups.
# `condition` is the hd_etiqueta vector created for the chol test.
bps <- hd_pr %>% pull(trestbps)
# Fixed: the formula used `age`, so this exercise silently repeated the age
# test and never touched the trestbps values pulled above.
bps_by_condition <- t.test(bps ~ condition)
print(bps_by_condition)
# NOTE(review): the printed output that follows was produced by the old
# `age ~ condition` call (its header reads "data: age by condition");
# re-knit the document to refresh it.
##
## Welch Two Sample t-test
##
## data: age by condition
## t = 4.0303, df = 300.93, p-value = 7.061e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2.067682 6.013385
## sample estimates:
## mean in group Enfermedad mean in group No enfermedad
## 56.62590 52.58537
# Boxplot of trestbps per disease group with the raw observations jittered on
# top; boxplot outlier markers are hidden and the legend is dropped.
ggplot(hd_pr, aes(x = hd_etiqueta, y = trestbps, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  theme(legend.position = "none") +
  labs(x = "", y = "blood pressure (mm Hg)")
¿ Las medias de los grupos son o no son similares ?
# Same trestbps boxplot as before, split into one panel per level of `sex`.
ggplot(hd_pr, aes(x = hd_etiqueta, y = trestbps, colour = hd_etiqueta)) +
  geom_boxplot(outlier.shape = NA, show.legend = FALSE) +
  geom_jitter(alpha = 0.4, width = 0.3) +
  facet_wrap(vars(sex)) +
  theme(legend.position = "none") +
  labs(x = "", y = "blood pressure (mm Hg)")
4. Examina la variable ‘chol’ entre los grupos hombre/mujer
##
## Welch Two Sample t-test
##
## data: chol by sex
## t = -3.0643, df = 136.37, p-value = 0.002631
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -36.445477 -7.855795
## sample estimates:
## mean in group hombre mean in group mujer
## 239.6019 261.7526
5. Examina la variable ‘trestbps’ entre los grupos hombre/mujer
##
## Welch Two Sample t-test
##
## data: hd_pr$trestbps by hd_pr$sex
## t = -1.0622, df = 165.36, p-value = 0.2897
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.939856 2.084686
## sample estimates:
## mean in group hombre mean in group mujer
## 130.9126 133.3402
6. Examina la variable ‘trestbps’ entre los grupos hombre/mujer ENFERMOS
##
## Welch Two Sample t-test
##
## data: enfermedad$trestbps by enfermedad$sex
## t = -3.2448, df = 31.365, p-value = 0.002792
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.886672 -5.453679
## sample estimates:
## mean in group hombre mean in group mujer
## 131.9298 146.6000
7. Examina la variable ‘thalach’ entre los grupos hombre/mujer
##
## Welch Two Sample t-test
##
## data: hd_pr$thalach by hd_pr$sex
## t = -0.90442, df = 223.85, p-value = 0.3667
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -7.572564 2.808276
## sample estimates:
## mean in group hombre mean in group mujer
## 148.8447 151.2268
8. Examina la variable ‘thalach’ entre los grupos hombre/mujer ENFERMOS
##
## Welch Two Sample t-test
##
## data: enfermedad$thalach by enfermedad$sex
## t = -1.039, df = 39.072, p-value = 0.3052
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -14.015402 4.502419
## sample estimates:
## mean in group hombre mean in group mujer
## 138.4035 143.1600